EVALUACIÓN DE MODELOS

1. Importación y estandarización de la información

load("../data/Frecuencia_De_Accidentes_Mensual.Rda")
load("../data/Dias_Especiales_Mensuales.Rda")

Se crean las columnas de accidentes Graves y leves para saber la frecuencia por día

library(reshape)
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
Total_Dataset_Freq_M <- cast(Total_Dataset_Freq_M[,c(1,3,4,5)],ANO+MES~GRAVEDAD)
## Using FREQ as value column.  Use the value argument to cast to override this choice

Se agrega la columna TOTAL_ACCIDENTES

Total_Dataset_Freq_M$TOTAL_ACCIDENTES <- Total_Dataset_Freq_M$ACCIDENTES_GRAVES + Total_Dataset_Freq_M$ACCIDENTES_LEVES
Total_Dataset_Freq_M <- sqldf("SELECT* 
              FROM Total_Dataset_Freq_M FS
              LEFT JOIN Dias_Especiales_Mensuales DES 
              ON (FS.ANO=DES.ANO AND FS.MES=DES.MES)")
library(dplyr)
Total_Dataset_Freq_M<-unite_(Total_Dataset_Freq_M, "Ano_mes", c("ANO..6","MES..8"))
save(Total_Dataset_Freq_M,file="../Modelos/Total_Dataset_Freq_M_mensual.Rda")

2. Partición de los datos para entrenamiento y test

Se ajustarán modelos con la información disponible desde el 01 de enero de 2014 hasta el 31 de diciembre de 2017 y se utilizará el año 2018 para validar el modelo:

Train_M_Dataset <- subset(Total_Dataset_Freq_M, ANO!="2018")
summary(Train_M_Dataset $ANO)
##    Length     Class      Mode 
##        48 character character

Se ajustan otra vez los niveles del factor ANO

Train_M_Dataset $ANO <- factor(Train_M_Dataset $ANO)
summary(Train_M_Dataset $ANO)
## 2014 2015 2016 2017 
##   12   12   12   12
library(sqldf)
Test_M_Dataset <- sqldf("SELECT *  
       FROM Total_Dataset_Freq_M
       WHERE ANO == 2018")
summary(Test_M_Dataset$ANO)
##    Length     Class      Mode 
##        12 character character

Se ajustan otra vez los niveles del factor ANO

Test_M_Dataset$ANO <- factor(Test_M_Dataset$ANO)
summary(Test_M_Dataset$ANO)
## 2018 
##   12

3. Selección de las mejores variables para el modelo

Se utilizará el método forward selection para elegir las mejores variables explicativas del modelo teniendo como criterio aquellas variables que presente mejor R^2 ajustado

library (leaps)
regfit.fwd=regsubsets (TOTAL_ACCIDENTES∼Ano_Base+MES+Freq_lunes+Freq_martes+Freq_miercoles+Freq_jueves+Freq_viernes+Freq_sabado+Freq_domingo+Feriados+Feriados_Lunes+Feriados_Otro,Train_M_Dataset , method ="forward", nvmax= 80)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 1 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : nvmax reduced to 21
## Warning in rval$lopt[] <- rval$vorder[rval$lopt]: número de items para para
## sustituir no es un múltiplo de la longitud del reemplazo
summary (regfit.fwd)
## Subset selection object
## Call: regsubsets.formula(TOTAL_ACCIDENTES ~ Ano_Base + MES + Freq_lunes + 
##     Freq_martes + Freq_miercoles + Freq_jueves + Freq_viernes + 
##     Freq_sabado + Freq_domingo + Feriados + Feriados_Lunes + 
##     Feriados_Otro, Train_M_Dataset, method = "forward", nvmax = 80)
## 22 Variables  (and intercept)
##                Forced in Forced out
## Ano_Base           FALSE      FALSE
## MES02              FALSE      FALSE
## MES03              FALSE      FALSE
## MES04              FALSE      FALSE
## MES05              FALSE      FALSE
## MES06              FALSE      FALSE
## MES07              FALSE      FALSE
## MES08              FALSE      FALSE
## MES09              FALSE      FALSE
## MES10              FALSE      FALSE
## MES11              FALSE      FALSE
## MES12              FALSE      FALSE
## Freq_lunes         FALSE      FALSE
## Freq_martes        FALSE      FALSE
## Freq_miercoles     FALSE      FALSE
## Freq_jueves        FALSE      FALSE
## Freq_viernes       FALSE      FALSE
## Freq_sabado        FALSE      FALSE
## Freq_domingo       FALSE      FALSE
## Feriados           FALSE      FALSE
## Feriados_Lunes     FALSE      FALSE
## Feriados_Otro      FALSE      FALSE
## 1 subsets of each size up to 21
## Selection Algorithm: forward
##           Ano_Base MES02 MES03 MES04 MES05 MES06 MES07 MES08 MES09 MES10
## 1  ( 1 )  " "      " "   " "   " "   " "   " "   " "   "*"   " "   " "  
## 2  ( 1 )  " "      " "   " "   " "   " "   " "   " "   "*"   " "   " "  
## 3  ( 1 )  " "      "*"   " "   " "   " "   " "   " "   "*"   " "   " "  
## 4  ( 1 )  " "      "*"   " "   " "   "*"   " "   " "   "*"   " "   " "  
## 5  ( 1 )  " "      "*"   " "   " "   "*"   " "   " "   "*"   " "   " "  
## 6  ( 1 )  " "      "*"   " "   " "   "*"   " "   "*"   "*"   " "   " "  
## 7  ( 1 )  " "      "*"   "*"   " "   "*"   " "   "*"   "*"   " "   " "  
## 8  ( 1 )  " "      "*"   "*"   " "   "*"   " "   "*"   "*"   " "   " "  
## 9  ( 1 )  " "      "*"   "*"   " "   "*"   " "   "*"   "*"   " "   "*"  
## 10  ( 1 ) "*"      "*"   "*"   " "   "*"   " "   "*"   "*"   " "   "*"  
## 11  ( 1 ) "*"      "*"   "*"   " "   "*"   " "   "*"   "*"   "*"   "*"  
## 12  ( 1 ) "*"      "*"   "*"   "*"   "*"   " "   "*"   "*"   "*"   "*"  
## 13  ( 1 ) "*"      "*"   "*"   "*"   "*"   " "   "*"   "*"   "*"   "*"  
## 14  ( 1 ) "*"      "*"   "*"   "*"   "*"   " "   "*"   "*"   "*"   "*"  
## 15  ( 1 ) "*"      "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"  
## 16  ( 1 ) "*"      "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"  
## 17  ( 1 ) "*"      "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"  
## 18  ( 1 ) "*"      "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"  
## 19  ( 1 ) "*"      "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"  
## 20  ( 1 ) "*"      "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"  
## 21  ( 1 ) "*"      "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"   "*"  
##           MES11 MES12 Freq_lunes Freq_martes Freq_miercoles Freq_jueves
## 1  ( 1 )  " "   " "   " "        " "         " "            " "        
## 2  ( 1 )  " "   " "   " "        " "         " "            " "        
## 3  ( 1 )  " "   " "   " "        " "         " "            " "        
## 4  ( 1 )  " "   " "   " "        " "         " "            " "        
## 5  ( 1 )  " "   " "   " "        " "         " "            " "        
## 6  ( 1 )  " "   " "   " "        " "         " "            " "        
## 7  ( 1 )  " "   " "   " "        " "         " "            " "        
## 8  ( 1 )  " "   "*"   " "        " "         " "            " "        
## 9  ( 1 )  " "   "*"   " "        " "         " "            " "        
## 10  ( 1 ) " "   "*"   " "        " "         " "            " "        
## 11  ( 1 ) " "   "*"   " "        " "         " "            " "        
## 12  ( 1 ) " "   "*"   " "        " "         " "            " "        
## 13  ( 1 ) " "   "*"   " "        " "         " "            " "        
## 14  ( 1 ) "*"   "*"   " "        " "         " "            " "        
## 15  ( 1 ) "*"   "*"   " "        " "         " "            " "        
## 16  ( 1 ) "*"   "*"   " "        " "         " "            " "        
## 17  ( 1 ) "*"   "*"   " "        " "         " "            " "        
## 18  ( 1 ) "*"   "*"   " "        " "         "*"            " "        
## 19  ( 1 ) "*"   "*"   "*"        " "         "*"            " "        
## 20  ( 1 ) "*"   "*"   "*"        " "         "*"            "*"        
## 21  ( 1 ) "*"   "*"   "*"        "*"         "*"            "*"        
##           Freq_viernes Freq_sabado Freq_domingo Feriados Feriados_Lunes
## 1  ( 1 )  " "          " "         " "          " "      " "           
## 2  ( 1 )  " "          " "         " "          "*"      " "           
## 3  ( 1 )  " "          " "         " "          "*"      " "           
## 4  ( 1 )  " "          " "         " "          "*"      " "           
## 5  ( 1 )  "*"          " "         " "          "*"      " "           
## 6  ( 1 )  "*"          " "         " "          "*"      " "           
## 7  ( 1 )  "*"          " "         " "          "*"      " "           
## 8  ( 1 )  "*"          " "         " "          "*"      " "           
## 9  ( 1 )  "*"          " "         " "          "*"      " "           
## 10  ( 1 ) "*"          " "         " "          "*"      " "           
## 11  ( 1 ) "*"          " "         " "          "*"      " "           
## 12  ( 1 ) "*"          " "         " "          "*"      " "           
## 13  ( 1 ) "*"          " "         " "          "*"      " "           
## 14  ( 1 ) "*"          " "         " "          "*"      " "           
## 15  ( 1 ) "*"          " "         " "          "*"      " "           
## 16  ( 1 ) "*"          "*"         " "          "*"      " "           
## 17  ( 1 ) "*"          "*"         "*"          "*"      " "           
## 18  ( 1 ) "*"          "*"         "*"          "*"      " "           
## 19  ( 1 ) "*"          "*"         "*"          "*"      " "           
## 20  ( 1 ) "*"          "*"         "*"          "*"      " "           
## 21  ( 1 ) "*"          "*"         "*"          "*"      " "           
##           Feriados_Otro
## 1  ( 1 )  " "          
## 2  ( 1 )  " "          
## 3  ( 1 )  " "          
## 4  ( 1 )  " "          
## 5  ( 1 )  " "          
## 6  ( 1 )  " "          
## 7  ( 1 )  " "          
## 8  ( 1 )  " "          
## 9  ( 1 )  " "          
## 10  ( 1 ) " "          
## 11  ( 1 ) " "          
## 12  ( 1 ) " "          
## 13  ( 1 ) "*"          
## 14  ( 1 ) "*"          
## 15  ( 1 ) "*"          
## 16  ( 1 ) "*"          
## 17  ( 1 ) "*"          
## 18  ( 1 ) "*"          
## 19  ( 1 ) "*"          
## 20  ( 1 ) "*"          
## 21  ( 1 ) "*"
reg.summary =summary(regfit.fwd)
names(reg.summary)
## [1] "which"  "rsq"    "rss"    "adjr2"  "cp"     "bic"    "outmat" "obj"
reg.summary$rsq
##  [1] 0.1195680 0.2016062 0.3168790 0.4057561 0.4621648 0.5081664 0.5629993
##  [8] 0.6187559 0.6594781 0.6963405 0.7262180 0.7430882 0.8081586 0.8238334
## [15] 0.8695697 0.8710930 0.8758966 0.8767008 0.8778598 0.8780811 0.8812346

Selección de variables con el mejor R^2 ajustado

max_adjr<-which.max (reg.summary$adjr2)
max_adjr
## [1] 15
par(mfrow =c(2,2))
plot(reg.summary$rss ,xlab=" Número de Variables ",ylab=" RSS",
type="l")
plot(reg.summary$adjr2 ,xlab =" Número de Variables ",
ylab=" Adjusted RSq",type="l")

points (max_adjr, reg.summary$adjr2[max_adjr], col ="red",cex =2, pch =20)

plot(regfit.fwd ,scale ="adjr2")

Variables seleccionadas

coef(regfit.fwd ,max_adjr)
##   (Intercept)      Ano_Base         MES02         MES03         MES04 
##    3183.96697      74.76902     167.14852     557.22647     395.02177 
##         MES05         MES06         MES07         MES08         MES09 
##     619.98671     283.32424     558.98976     708.79724     512.27746 
##         MES10         MES11         MES12  Freq_viernes      Feriados 
##     493.77135     308.38629     509.45818     -55.75788     -53.99389 
## Feriados_Otro 
##     -45.29530

4. Modelamiento

set.seed(123) # fija la semilla del generador de parámetros para que sea reproducible

Se realiza el modelo de regresión lineal con las variables seleccionadas y se revisa el p-valor de cada una para seleccionar las variables definitivas del modelo

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
trcntrl = trainControl(method="cv", number=10)
caret_lm_fit_m = caret::train(TOTAL_ACCIDENTES∼Ano_Base+MES+Feriados, data=Train_M_Dataset ,
                      method = "lm", trControl = trcntrl,
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
summary(caret_lm_fit_m)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -166.88  -53.87   -8.75   67.13  199.55 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3522.46      14.39 244.752  < 2e-16 ***
## Ano_Base       37.76      14.56   2.594   0.0139 *  
## MES02          65.64      25.54   2.570   0.0147 *  
## MES03         168.41      20.11   8.375 8.89e-10 ***
## MES04         110.45      20.11   5.493 3.93e-06 ***
## MES05         180.22      19.69   9.152 1.07e-10 ***
## MES06         100.25      19.80   5.064 1.42e-05 ***
## MES07         161.84      20.11   8.049 2.21e-09 ***
## MES08         208.92      19.69  10.609 2.51e-12 ***
## MES09         154.25      25.54   6.039 7.66e-07 ***
## MES10         151.77      21.31   7.123 3.12e-08 ***
## MES11         110.47      19.69   5.609 2.77e-06 ***
## MES12         139.86      19.69   7.102 3.32e-08 ***
## Feriados      -57.20      26.17  -2.185   0.0358 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.71 on 34 degrees of freedom
## Multiple R-squared:  0.8539, Adjusted R-squared:  0.798 
## F-statistic: 15.29 on 13 and 34 DF,  p-value: 1.458e-10

1. REGRESION LINEAL

Total Accidentes

head(Train_M_Dataset )
##    ANO MES ACCIDENTES_GRAVES ACCIDENTES_LEVES TOTAL_ACCIDENTES Ano_mes
## 1 2014  01              1619             1372             2991 2014_01
## 2 2014  02              1776             1482             3258 2014_02
## 3 2014  03              2152             1703             3855 2014_03
## 4 2014  04              1930             1495             3425 2014_04
## 5 2014  05              2037             1649             3686 2014_05
## 6 2014  06              1977             1427             3404 2014_06
##   Ano_Base Freq_lunes Freq_martes Freq_miercoles Freq_jueves Freq_viernes
## 1        0          0           0              1           1            1
## 2        0          0           0              0           0            0
## 3        0          1           0              0           0            0
## 4        0          0           1              1           0            0
## 5        0          0           0              0           1            1
## 6        0          1           0              0           0            0
##   Freq_sabado Freq_domingo Feriados Feriados_Lunes Feriados_Otro
## 1           0            0        2              1             1
## 2           0            0        0              0             0
## 3           1            1        1              1             0
## 4           0            0        2              0             2
## 5           1            0        1              0             1
## 6           0            1        3              3             0
library(caret)
trcntrl = trainControl(method="cv", number=10)
caret_lm_fit_m = caret::train(TOTAL_ACCIDENTES∼Ano_Base+MES+Feriados, data=Train_M_Dataset ,
                      method = "lm", trControl = trcntrl,
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
summary(caret_lm_fit_m)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -166.88  -53.87   -8.75   67.13  199.55 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3522.46      14.39 244.752  < 2e-16 ***
## Ano_Base       37.76      14.56   2.594   0.0139 *  
## MES02          65.64      25.54   2.570   0.0147 *  
## MES03         168.41      20.11   8.375 8.89e-10 ***
## MES04         110.45      20.11   5.493 3.93e-06 ***
## MES05         180.22      19.69   9.152 1.07e-10 ***
## MES06         100.25      19.80   5.064 1.42e-05 ***
## MES07         161.84      20.11   8.049 2.21e-09 ***
## MES08         208.92      19.69  10.609 2.51e-12 ***
## MES09         154.25      25.54   6.039 7.66e-07 ***
## MES10         151.77      21.31   7.123 3.12e-08 ***
## MES11         110.47      19.69   5.609 2.77e-06 ***
## MES12         139.86      19.69   7.102 3.32e-08 ***
## Feriados      -57.20      26.17  -2.185   0.0358 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99.71 on 34 degrees of freedom
## Multiple R-squared:  0.8539, Adjusted R-squared:  0.798 
## F-statistic: 15.29 on 13 and 34 DF,  p-value: 1.458e-10
caret_lm_fit_m
## Linear Regression 
## 
## 48 samples
##  3 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 44, 44, 43, 43, 43, 42, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   125.2733  0.6523645  102.8469
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_lm_m<-predict(caret_lm_fit_m,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_lm_m<-mean((Train_M_Dataset $TOTAL_ACCIDENTES-y_tr_pred_lm_m)^2) # calcula el mse de entrenamiento
RMSE_tr_lm_m = sqrt(mse_tr_lm_m)
mse_tr_lm_m
## [1] 7042.377
RMSE_tr_lm_m
## [1] 83.91887

Calculo MSE y RMSE para los datos de validación

y_test_pred_lm_m<-predict(caret_lm_fit_m,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_lm_m<-mean((Test_M_Dataset$TOTAL_ACCIDENTES-y_test_pred_lm_m)^2) # calcula el mse de entrenamiento
RMSE_test_lm_m = sqrt(mse_test_lm_m)
mse_test_lm_m
## [1] 42167.71
RMSE_test_lm_m
## [1] 205.3478

Predicción en la muestra

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:reshape':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_lm_m,
            name='Modelo lm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_lm_m,
            name='Modelo lm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Accidentes Graves

trcntrl = trainControl(method="cv", number=10)
caret_lm_fit_m_m = caret::train(ACCIDENTES_GRAVES∼MES+Feriados+Freq_domingo, data=Train_M_Dataset ,
                      method = "lm", trControl = trcntrl,
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
summary(caret_lm_fit_m_m)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -131.750  -48.669    2.574   37.767  138.250 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1962.02      11.31 173.434  < 2e-16 ***
## MES02           32.55      20.09   1.620 0.114429    
## MES03           89.88      15.82   5.681 2.24e-06 ***
## MES04           49.54      15.85   3.126 0.003616 ** 
## MES05           88.47      15.48   5.715 2.02e-06 ***
## MES06           48.79      15.72   3.105 0.003825 ** 
## MES07           79.47      15.82   5.023 1.60e-05 ***
## MES08          115.28      15.48   7.448 1.22e-08 ***
## MES09           79.26      20.09   3.946 0.000378 ***
## MES10           63.49      16.80   3.778 0.000609 ***
## MES11           44.20      15.48   2.855 0.007275 ** 
## MES12           43.59      15.59   2.797 0.008438 ** 
## Feriados       -30.96      21.05  -1.471 0.150498    
## Freq_domingo     3.54      12.76   0.278 0.783058    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.38 on 34 degrees of freedom
## Multiple R-squared:  0.739,  Adjusted R-squared:  0.6392 
## F-statistic: 7.405 on 13 and 34 DF,  p-value: 1.342e-06
caret_lm_fit_m_m
## Linear Regression 
## 
## 48 samples
##  3 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 43, 43, 43, 45, 43, 43, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   89.32155  0.5245695  74.06608
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_lm_m_m<-predict(caret_lm_fit_m_m,Train_M_Dataset [,c("Freq_domingo","MES","Feriados")])
mse_tr_lm_m_m<-mean((Train_M_Dataset $ACCIDENTES_GRAVES-y_tr_pred_lm_m_m)^2) # calcula el mse de entrenamiento
RMSE_tr_lm_m_m = sqrt(mse_tr_lm_m_m)
mse_tr_lm_m_m
## [1] 4351.268
RMSE_tr_lm_m_m
## [1] 65.96414

Calculo MSE y RMSE para los datos de validación

y_test_pred_lm_m_m<-predict(caret_lm_fit_m_m,Test_M_Dataset[,c("Freq_domingo","MES","Feriados")])
mse_test_lm_m_m<-mean((Test_M_Dataset$ACCIDENTES_GRAVES-y_test_pred_lm_m_m)^2) # calcula el mse de entrenamiento
RMSE_test_lm_m_m = sqrt(mse_test_lm_m_m)
mse_test_lm_m_m
## [1] 40538.09
RMSE_test_lm_m_m
## [1] 201.3407

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_lm_m_m,
            name='Modelo lm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes graves"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         split = ~ANO,
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_lm_m_m,
            name='Modelo lm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes graves"),
         legend = list(x = 1, y = 0.4))

Accidentes Leves

trcntrl = trainControl(method="cv", number=10)
caret_lm_fit_m_sd = caret::train(ACCIDENTES_LEVES∼Ano_Base+MES+Feriados, data=Train_M_Dataset ,
                      method = "lm", trControl = trcntrl,
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
summary(caret_lm_fit_m_sd)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -123.098  -41.439    3.148   38.715  125.223 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1560.44      10.22 152.709  < 2e-16 ***
## Ano_Base       27.26      10.34   2.637 0.012509 *  
## MES02          33.82      18.13   1.865 0.070796 .  
## MES03          78.46      14.28   5.495 3.90e-06 ***
## MES04          61.35      14.28   4.297 0.000137 ***
## MES05          91.75      13.98   6.562 1.62e-07 ***
## MES06          52.00      14.06   3.699 0.000759 ***
## MES07          82.30      14.28   5.764 1.74e-06 ***
## MES08          93.64      13.98   6.697 1.09e-07 ***
## MES09          75.72      18.13   4.176 0.000195 ***
## MES10          88.14      15.13   5.826 1.45e-06 ***
## MES11          66.27      13.98   4.739 3.73e-05 ***
## MES12          96.78      13.98   6.922 5.62e-08 ***
## Feriados      -26.70      18.58  -1.437 0.159925    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70.8 on 34 degrees of freedom
## Multiple R-squared:  0.7479, Adjusted R-squared:  0.6516 
## F-statistic: 7.761 on 13 and 34 DF,  p-value: 7.884e-07
caret_lm_fit_m_sd
## Linear Regression 
## 
## 48 samples
##  3 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 43, 43, 43, 42, 44, 43, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   79.92116  0.5284022  69.41677
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_lm_m_sd<-predict(caret_lm_fit_m_sd,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_lm_m_sd<-mean((Train_M_Dataset $ACCIDENTES_LEVES-y_tr_pred_lm_m_sd)^2) # calcula el mse de entrenamiento
RMSE_tr_lm_m_sd = sqrt(mse_tr_lm_m_sd)
mse_tr_lm_m_sd
## [1] 3550.129
RMSE_tr_lm_m_sd
## [1] 59.58296

Calculo MSE y RMSE para los datos de validación

y_test_pred_lm_m_sd<-predict(caret_lm_fit_m_sd,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_lm_m_sd<-mean((Test_M_Dataset$ACCIDENTES_LEVES-y_test_pred_lm_m_sd)^2) # calcula el mse de entrenamiento
RMSE_test_lm_m_sd = sqrt(mse_test_lm_m_sd)
mse_test_lm_m_sd
## [1] 4435.997
RMSE_test_lm_m_sd
## [1] 66.60328

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_lm_m_sd,
            name='Modelo lm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total Accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes leves"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_lm_m_sd,
            name='Modelo lm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes leves"),
         legend = list(x = 1, y = 0.4))

Resumen Modelos Regresión lineal para los diferentes tipos de accidente

Tipo_de_accidentes= c("Total Accidentes","Accidentes graves","Accidentes leves")
RMSE_Train_lm = round(c(RMSE_tr_lm_m,RMSE_tr_lm_m_m,RMSE_tr_lm_m_sd), 3)
RMSE_Test_lm = round(c(RMSE_test_lm_m,RMSE_test_lm_m_m,RMSE_test_lm_m_sd),3)

Tabla_lm = data.frame (cbind(Tipo_de_accidentes,RMSE_Train_lm,RMSE_Test_lm))
Tabla_lm
##   Tipo_de_accidentes RMSE_Train_lm RMSE_Test_lm
## 1   Total Accidentes        83.919      205.348
## 2  Accidentes graves        65.964      201.341
## 3   Accidentes leves        59.583       66.603

2. KNN

head(Train_M_Dataset )
##    ANO MES ACCIDENTES_GRAVES ACCIDENTES_LEVES TOTAL_ACCIDENTES Ano_mes
## 1 2014  01              1619             1372             2991 2014_01
## 2 2014  02              1776             1482             3258 2014_02
## 3 2014  03              2152             1703             3855 2014_03
## 4 2014  04              1930             1495             3425 2014_04
## 5 2014  05              2037             1649             3686 2014_05
## 6 2014  06              1977             1427             3404 2014_06
##   Ano_Base Freq_lunes Freq_martes Freq_miercoles Freq_jueves Freq_viernes
## 1        0          0           0              1           1            1
## 2        0          0           0              0           0            0
## 3        0          1           0              0           0            0
## 4        0          0           1              1           0            0
## 5        0          0           0              0           1            1
## 6        0          1           0              0           0            0
##   Freq_sabado Freq_domingo Feriados Feriados_Lunes Feriados_Otro
## 1           0            0        2              1             1
## 2           0            0        0              0             0
## 3           1            1        1              1             0
## 4           0            0        2              0             2
## 5           1            0        1              0             1
## 6           0            1        3              3             0

Total Accidentes

library(caret)
trcntrl = trainControl(method="cv", number=10)
caret_knn_fit_m = caret::train(TOTAL_ACCIDENTES∼Ano_Base+MES+Feriados, data=Train_M_Dataset ,
                      method = "knn", trControl = trcntrl,
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
summary(caret_knn_fit_m)
##             Length Class      Mode     
## learn        2     -none-     list     
## k            1     -none-     numeric  
## theDots      0     -none-     list     
## xNames      13     -none-     character
## problemType  1     -none-     character
## tuneValue    1     data.frame list     
## obsLevels    1     -none-     logical  
## param        0     -none-     list
caret_knn_fit_m
## k-Nearest Neighbors 
## 
## 48 samples
##  3 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 43, 43, 43, 44, 44, 44, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  286.9993  0.4149184  258.0058
##    7  293.9042  0.4854303  263.4552
##    9  234.7895  0.3648568  196.9260
##   11  230.5185  0.3211644  195.6398
##   13  214.3056  0.3830598  176.3921
##   15  215.5572  0.3325944  176.8265
##   17  218.2165  0.4297943  179.6711
##   19  220.7714  0.3912014  179.4098
##   21  217.7822  0.4217734  175.7535
##   23  217.2215  0.4266236  174.1418
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_knn_m<-predict(caret_knn_fit_m,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_knn_m<-mean((Train_M_Dataset $TOTAL_ACCIDENTES-y_tr_pred_knn_m)^2) # calcula el mse de entrenamiento
RMSE_tr_knn_m = sqrt(mse_tr_knn_m)
mse_tr_knn_m
## [1] 46788.17
RMSE_tr_knn_m
## [1] 216.3057

Calculo MSE y RMSE para los datos de validación

y_test_pred_knn_m<-predict(caret_knn_fit_m,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_knn_m<-mean((Test_M_Dataset$TOTAL_ACCIDENTES-y_test_pred_knn_m)^2) # calcula el mse de entrenamiento
RMSE_test_knn_m = sqrt(mse_test_knn_m)
mse_test_knn_m
## [1] 31547.02
RMSE_test_knn_m
## [1] 177.6148

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_knn_m,
            name='Modelo knn',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total Accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_knn_m,
            name='Modelo knn',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total Accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Accidentes graves

trcntrl = trainControl(method="cv", number=10)
caret_knn_fit_m_m = caret::train(ACCIDENTES_GRAVES∼Ano_Base+MES+Feriados, data=Train_M_Dataset ,
                      method = "knn", trControl = trcntrl,
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
summary(caret_knn_fit_m_m)
##             Length Class      Mode     
## learn        2     -none-     list     
## k            1     -none-     numeric  
## theDots      0     -none-     list     
## xNames      13     -none-     character
## problemType  1     -none-     character
## tuneValue    1     data.frame list     
## obsLevels    1     -none-     logical  
## param        0     -none-     list
caret_knn_fit_m_m
## k-Nearest Neighbors 
## 
## 48 samples
##  3 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 42, 44, 44, 44, 43, 43, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  151.7366  0.2637437  131.7257
##    7  154.0235  0.3431891  130.8144
##    9  133.1727  0.2492413  113.7180
##   11  130.2438  0.2492649  111.3584
##   13  126.0656  0.3005884  108.2311
##   15  123.8132  0.3114384  105.9967
##   17  124.8157  0.2806645  107.4023
##   19  125.4078  0.2615448  107.4120
##   21  125.8452  0.3329166  109.3247
##   23  125.4787  0.3010697  108.4377
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_knn_m_m<-predict(caret_knn_fit_m_m,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_knn_m_m<-mean((Train_M_Dataset $ACCIDENTES_GRAVES-y_tr_pred_knn_m_m)^2) # calcula el mse de entrenamiento
RMSE_tr_knn_m_m = sqrt(mse_tr_knn_m_m)
mse_tr_knn_m_m
## [1] 16520.6
RMSE_tr_knn_m_m
## [1] 128.5325

Calculo MSE y RMSE para los datos de validación

y_test_pred_knn_m_m<-predict(caret_knn_fit_m_m,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_knn_m_m<-mean((Test_M_Dataset$ACCIDENTES_GRAVES-y_test_pred_knn_m_m)^2) # calcula el mse de entrenamiento
RMSE_test_knn_m_m = sqrt(mse_test_knn_m_m)
mse_test_knn_m_m
## [1] 27160.38
RMSE_test_knn_m_m
## [1] 164.8041

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_knn_m_m,
            name='Modelo knn',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total Accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes graves"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_knn_m_m,
            name='Modelo knn',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes graves"),
         legend = list(x = 1, y = 0.4))

Accidentes Leves

trcntrl = trainControl(method="cv", number=10)
caret_knn_fit_m_sd = caret::train(ACCIDENTES_LEVES∼Ano_Base+MES+Feriados, data=Train_M_Dataset ,
                      method = "knn", trControl = trcntrl,
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
summary(caret_knn_fit_m_sd)
##             Length Class      Mode     
## learn        2     -none-     list     
## k            1     -none-     numeric  
## theDots      0     -none-     list     
## xNames      13     -none-     character
## problemType  1     -none-     character
## tuneValue    1     data.frame list     
## obsLevels    1     -none-     logical  
## param        0     -none-     list
caret_knn_fit_m_sd
## k-Nearest Neighbors 
## 
## 48 samples
##  3 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 45, 43, 44, 44, 42, 43, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE      
##    5  165.6046  0.2967434  144.08699
##    7  159.1989  0.3370283  136.97538
##    9  127.1377  0.5456573  105.57562
##   11  120.4366  0.5795465   99.12592
##   13  118.7365  0.5605990   98.01849
##   15  114.8793  0.4136792   93.07322
##   17  114.2984  0.3199909   93.06156
##   19  113.8753  0.3135036   92.26790
##   21  114.2376  0.2793133   92.18137
##   23  114.6177  0.2133970   90.76607
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 19.

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_knn_m_sd<-predict(caret_knn_fit_m_sd,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_knn_m_sd<-mean((Train_M_Dataset $ACCIDENTES_LEVES-y_tr_pred_knn_m_sd)^2) # calcula el mse de entrenamiento
RMSE_tr_knn_m_sd = sqrt(mse_tr_knn_m_sd)
mse_tr_knn_m_sd
## [1] 13114.59
RMSE_tr_knn_m_sd
## [1] 114.519

Calculo MSE y RMSE para los datos de validación

y_test_pred_knn_m_sd<-predict(caret_knn_fit_m_sd,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_knn_m_sd<-mean((Test_M_Dataset$ACCIDENTES_LEVES-y_test_pred_knn_m_sd)^2) # calcula el mse de entrenamiento
RMSE_test_knn_m_sd = sqrt(mse_test_knn_m_sd)
mse_test_knn_m_sd
## [1] 15803.92
RMSE_test_knn_m_sd
## [1] 125.7137

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_knn_m_sd,
            name='Modelo knn',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total Accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_knn_m_sd,
            name='Modelo knn',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Resumen Modelos KNN para los diferentes tipos de accidente

Tipo_de_accidentes= c("Total Accidentes","Accidentes graves","Accidentes leves")
RMSE_Train_knn = round(c(RMSE_tr_knn_m,RMSE_tr_knn_m_m,RMSE_tr_knn_m_sd), 3)
RMSE_Test_knn = round(c(RMSE_test_knn_m,RMSE_test_knn_m_m,RMSE_test_knn_m_sd),3)

Tabla_knn = data.frame (cbind(Tipo_de_accidentes,RMSE_Train_knn,RMSE_Test_knn))
Tabla_knn
##   Tipo_de_accidentes RMSE_Train_knn RMSE_Test_knn
## 1   Total Accidentes        216.306       177.615
## 2  Accidentes graves        128.533       164.804
## 3   Accidentes leves        114.519       125.714

3. MODELO LINEAL GENERALIZADO

Total Accidentes

glm_fit_m<-glm(TOTAL_ACCIDENTES∼Ano_Base+MES+Feriados, data=Train_M_Dataset , family = "poisson")
summary(glm_fit_m)
## 
## Call:
## glm(formula = TOTAL_ACCIDENTES ~ Ano_Base + MES + Feriados, family = "poisson", 
##     data = Train_M_Dataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7770  -0.9136  -0.1718   1.1553   3.2824  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  8.040802   0.013490 596.073  < 2e-16 ***
## Ano_Base     0.021498   0.004872   4.413 1.02e-05 ***
## MES02        0.076534   0.015886   4.818 1.45e-06 ***
## MES03        0.181253   0.012534  14.461  < 2e-16 ***
## MES04        0.122707   0.012707   9.656  < 2e-16 ***
## MES05        0.193291   0.012276  15.745  < 2e-16 ***
## MES06        0.111670   0.012573   8.882  < 2e-16 ***
## MES07        0.174897   0.012542  13.944  < 2e-16 ***
## MES08        0.220877   0.012200  18.104  < 2e-16 ***
## MES09        0.166032   0.015686  10.585  < 2e-16 ***
## MES10        0.164539   0.013244  12.423  < 2e-16 ***
## MES11        0.122828   0.012477   9.844  < 2e-16 ***
## MES12        0.153123   0.012389  12.359  < 2e-16 ***
## Feriados    -0.018265   0.004898  -3.729 0.000192 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 667.240  on 47  degrees of freedom
## Residual deviance:  94.222  on 34  degrees of freedom
## AIC: 602.36
## 
## Number of Fisher Scoring iterations: 3
glm_fit_m
## 
## Call:  glm(formula = TOTAL_ACCIDENTES ~ Ano_Base + MES + Feriados, family = "poisson", 
##     data = Train_M_Dataset)
## 
## Coefficients:
## (Intercept)     Ano_Base        MES02        MES03        MES04  
##     8.04080      0.02150      0.07653      0.18125      0.12271  
##       MES05        MES06        MES07        MES08        MES09  
##     0.19329      0.11167      0.17490      0.22088      0.16603  
##       MES10        MES11        MES12     Feriados  
##     0.16454      0.12283      0.15312     -0.01826  
## 
## Degrees of Freedom: 47 Total (i.e. Null);  34 Residual
## Null Deviance:       667.2 
## Residual Deviance: 94.22     AIC: 602.4

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_glm_m<-predict(glm_fit_m,Train_M_Dataset [,c("Ano_Base","MES","Feriados")],type="response")
mse_tr_glm_m<-mean((Train_M_Dataset $TOTAL_ACCIDENTES-y_tr_pred_glm_m)^2) # calcula el mse de entrenamiento
RMSE_tr_glm_m = sqrt(mse_tr_glm_m)
mse_tr_glm_m
## [1] 7022.906
RMSE_tr_glm_m
## [1] 83.80278

Calculo MSE y RMSE para los datos de validación

y_test_pred_glm_m<-predict(glm_fit_m,Test_M_Dataset[,c("Ano_Base","MES","Feriados")],type="response")
mse_test_glm_m<-mean((Train_M_Dataset $TOTAL_ACCIDENTES-y_test_pred_glm_m)^2) # calcula el mse de entrenamiento
RMSE_test_glm_m = sqrt(mse_test_glm_m)
mse_test_glm_m
## [1] 12294.85
RMSE_test_glm_m
## [1] 110.8821

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_glm_m,
            name='Modelo glm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_glm_m,
            name='Modelo glm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total Accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Accidentes Graves

glm_fit_m_m<-glm(ACCIDENTES_GRAVES∼Ano_Base+MES+Feriados, data=Train_M_Dataset , family = "poisson")
summary(glm_fit_m_m)
## 
## Call:
## glm(formula = ACCIDENTES_GRAVES ~ Ano_Base + MES + Feriados, 
##     family = "poisson", data = Train_M_Dataset)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.82869  -1.29977  -0.06613   0.96614   2.88852  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.482694   0.017965 416.510  < 2e-16 ***
## Ano_Base     0.010862   0.006527   1.664  0.09612 .  
## MES02        0.065540   0.021162   3.097  0.00195 ** 
## MES03        0.170614   0.016646  10.250  < 2e-16 ***
## MES04        0.097063   0.016929   5.734 9.83e-09 ***
## MES05        0.168463   0.016347  10.305  < 2e-16 ***
## MES06        0.095051   0.016717   5.686 1.30e-08 ***
## MES07        0.152485   0.016701   9.130  < 2e-16 ***
## MES08        0.214415   0.016178  13.253  < 2e-16 ***
## MES09        0.149577   0.020911   7.153 8.48e-13 ***
## MES10        0.123908   0.017709   6.997 2.61e-12 ***
## MES11        0.087723   0.016659   5.266 1.40e-07 ***
## MES12        0.085598   0.016667   5.136 2.81e-07 ***
## Feriados    -0.017289   0.006556  -2.637  0.00836 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 411.53  on 47  degrees of freedom
## Residual deviance: 105.21  on 34  degrees of freedom
## AIC: 585.25
## 
## Number of Fisher Scoring iterations: 3
glm_fit_m_m
## 
## Call:  glm(formula = ACCIDENTES_GRAVES ~ Ano_Base + MES + Feriados, 
##     family = "poisson", data = Train_M_Dataset)
## 
## Coefficients:
## (Intercept)     Ano_Base        MES02        MES03        MES04  
##     7.48269      0.01086      0.06554      0.17061      0.09706  
##       MES05        MES06        MES07        MES08        MES09  
##     0.16846      0.09505      0.15249      0.21441      0.14958  
##       MES10        MES11        MES12     Feriados  
##     0.12391      0.08772      0.08560     -0.01729  
## 
## Degrees of Freedom: 47 Total (i.e. Null);  34 Residual
## Null Deviance:       411.5 
## Residual Deviance: 105.2     AIC: 585.2

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_glm_m_m<-predict(glm_fit_m_m,Train_M_Dataset [,c("Ano_Base","MES","Feriados","Freq_viernes","Feriados_Lunes")],type="response")
mse_tr_glm_m_m<-mean((Train_M_Dataset $ACCIDENTES_GRAVES-y_tr_pred_glm_m_m)^2) # calcula el mse de entrenamiento
RMSE_tr_glm_m_m = sqrt(mse_tr_glm_m_m)
mse_tr_glm_m_m
## [1] 4256.688
RMSE_tr_glm_m_m
## [1] 65.2433

Calculo MSE y RMSE para los datos de validación

y_test_pred_glm_m_m<-predict(glm_fit_m_m,Test_M_Dataset[,c("Ano_Base","MES","Feriados")],type="response")
mse_test_glm_m_m<-mean((Train_M_Dataset $ACCIDENTES_GRAVES-y_test_pred_glm_m_m)^2) # calcula el mse de entrenamiento
RMSE_test_glm_m_m = sqrt(mse_test_glm_m_m)
mse_test_glm_m_m
## [1] 5190.199
RMSE_test_glm_m_m
## [1] 72.04304

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_glm_m_m,
            name='Modelo glm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes graves"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_glm_m_m,
            name='Modelo glm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes graves"),
         legend = list(x = 1, y = 0.4))

Accidentes Leves

glm_fit_m_sd<-glm(ACCIDENTES_LEVES∼Ano_Base+MES+Feriados, data=Train_M_Dataset , family = "poisson")
summary(glm_fit_m_sd)
## 
## Call:
## glm(formula = ACCIDENTES_LEVES ~ Ano_Base + MES + Feriados, family = "poisson", 
##     data = Train_M_Dataset)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -3.13147  -1.07468   0.05817   0.94150   3.12436  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.191696   0.020429 352.039  < 2e-16 ***
## Ano_Base     0.034873   0.007320   4.764 1.90e-06 ***
## MES02        0.091020   0.024048   3.785 0.000154 ***
## MES03        0.195228   0.019047  10.250  < 2e-16 ***
## MES04        0.155753   0.019237   8.096 5.66e-16 ***
## MES05        0.225331   0.018595  12.118  < 2e-16 ***
## MES06        0.133295   0.019079   6.986 2.82e-12 ***
## MES07        0.203902   0.018998  10.733  < 2e-16 ***
## MES08        0.229392   0.018578  12.348  < 2e-16 ***
## MES09        0.187511   0.023723   7.904 2.69e-15 ***
## MES10        0.216001   0.019962  10.820  < 2e-16 ***
## MES11        0.167601   0.018838   8.897  < 2e-16 ***
## MES12        0.236247   0.018550  12.736  < 2e-16 ***
## Feriados    -0.019486   0.007369  -2.644 0.008182 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 443.58  on 47  degrees of freedom
## Residual deviance: 109.38  on 34  degrees of freedom
## AIC: 578.38
## 
## Number of Fisher Scoring iterations: 3
glm_fit_m_sd
## 
## Call:  glm(formula = ACCIDENTES_LEVES ~ Ano_Base + MES + Feriados, family = "poisson", 
##     data = Train_M_Dataset)
## 
## Coefficients:
## (Intercept)     Ano_Base        MES02        MES03        MES04  
##     7.19170      0.03487      0.09102      0.19523      0.15575  
##       MES05        MES06        MES07        MES08        MES09  
##     0.22533      0.13330      0.20390      0.22939      0.18751  
##       MES10        MES11        MES12     Feriados  
##     0.21600      0.16760      0.23625     -0.01949  
## 
## Degrees of Freedom: 47 Total (i.e. Null);  34 Residual
## Null Deviance:       443.6 
## Residual Deviance: 109.4     AIC: 578.4

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_glm_m_sd<-predict(glm_fit_m_sd,Train_M_Dataset [,c("Ano_Base","MES","Feriados")],type="response")
mse_tr_glm_m_sd<-mean((Train_M_Dataset $ACCIDENTES_LEVES-y_tr_pred_glm_m_sd)^2) # calcula el mse de entrenamiento
RMSE_tr_glm_m_sd = sqrt(mse_tr_glm_m_sd)
mse_tr_glm_m_sd
## [1] 3517.603
RMSE_tr_glm_m_sd
## [1] 59.30938

Calculo MSE y RMSE para los datos de validación

y_test_pred_glm_m_sd<-predict(glm_fit_m_sd,Test_M_Dataset[,c("Ano_Base","MES","Feriados")],type="response")
mse_test_glm_m_sd<-mean((Train_M_Dataset $ACCIDENTES_LEVES-y_test_pred_glm_m_sd)^2) # calcula el mse de entrenamiento
RMSE_test_glm_m_sd = sqrt(mse_test_glm_m_sd)
mse_test_glm_m_sd
## [1] 5528.337
RMSE_test_glm_m_sd
## [1] 74.35279

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_glm_m_sd,
            name='Modelo glm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total Accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes leves"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_glm_m_sd,
            name='Modelo glm',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total Accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes leves"),
         legend = list(x = 1, y = 0.4))

#### REsumen Modelos Regresión lineal generalizado para los diferentes tipos de accidente

Tipo_de_accidentes= c("Total Accidentes","Accidentes Graves","Accidentes Leves")
RMSE_Train_glm = round(c(RMSE_tr_glm_m,RMSE_tr_glm_m_m,RMSE_tr_glm_m_sd), 3)
RMSE_Test_glm = round(c(RMSE_test_glm_m,RMSE_test_glm_m_m,RMSE_test_glm_m_sd),3)

Tabla_glm = data.frame (cbind(Tipo_de_accidentes,RMSE_Train_glm,RMSE_Test_glm))
Tabla_glm
##   Tipo_de_accidentes RMSE_Train_glm RMSE_Test_glm
## 1   Total Accidentes         83.803       110.882
## 2  Accidentes Graves         65.243        72.043
## 3   Accidentes Leves         59.309        74.353

4. ARBOLES DE REGRESION

Total Accidentes

trcntrl = trainControl(method="cv", number=10)
caret_tree_fit_m = caret::train(TOTAL_ACCIDENTES∼Ano_Base+MES+Feriados,data=Train_M_Dataset ,
                              method = "rpart", trControl = trcntrl,
                      parms = list(split = "gini"),
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
caret_tree_fit_m
## CART 
## 
## 48 samples
##  3 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 44, 44, 44, 42, 44, 42, ... 
## Resampling results across tuning parameters:
## 
##   cp           RMSE      Rsquared   MAE     
##   0.000000000  190.7647  0.4746258  154.7190
##   0.009395662  190.7647  0.4746258  154.7190
##   0.018791325  190.7647  0.4746258  154.7190
##   0.028186987  192.6841  0.4791335  156.6079
##   0.037582649  197.2557  0.4715011  160.7902
##   0.046978312  198.6513  0.4715011  162.8287
##   0.056373974  208.6457  0.4322710  171.6781
##   0.065769636  209.5126  0.4201176  172.2044
##   0.075165299  215.6028  0.4267704  174.0021
##   0.084560961  217.7171  0.3473503  177.6980
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01879132.

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_tree_m<-predict(caret_tree_fit_m,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_tree_m<-mean((Train_M_Dataset $TOTAL_ACCIDENTES-y_tr_pred_tree_m)^2) # calcula el mse de entrenamiento
RMSE_tr_tree_m = sqrt(mse_tr_tree_m)
mse_tr_tree_m
## [1] 41776.09
RMSE_tr_tree_m
## [1] 204.392

Calculo MSE y RMSE para los datos de validación

  y_test_pred_tree_m<-predict(caret_tree_fit_m,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_tree_m<-mean((Test_M_Dataset$TOTAL_ACCIDENTES-y_test_pred_tree_m)^2) # calcula el mse de entrenamiento
RMSE_test_tree_m = sqrt(mse_test_tree_m)
mse_test_tree_m
## [1] 61023.87
RMSE_test_tree_m
## [1] 247.0301

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_tree_m,
            name='Modelo tree',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_tree_m,
            name='Modelo tree',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Accidentes graves

trcntrl = trainControl(method="cv", number=10)
caret_tree_fit_m_m = caret::train(ACCIDENTES_GRAVES∼Ano_Base+MES+Feriados,data=Train_M_Dataset ,
                              method = "rpart", trControl = trcntrl,
                      parms = list(split = "gini"),
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
caret_tree_fit_m_m
## CART 
## 
## 48 samples
##  3 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 42, 43, 43, 43, 44, 43, ... 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared    MAE     
##   0.00000000  121.4800  0.19812474  105.0612
##   0.01098192  123.1253  0.17874696  106.0382
##   0.02196385  122.2402  0.17520610  105.0382
##   0.03294577  119.3979  0.22997713  103.0443
##   0.04392770  119.3979  0.22997713  103.0443
##   0.05490962  119.3979  0.22997713  103.0443
##   0.06589155  119.3979  0.22997713  103.0443
##   0.07687347  124.5515  0.09921628  107.2660
##   0.08785540  124.5515  0.09921628  107.2660
##   0.09883732  127.4723  0.05570420  111.3363
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.06589155.

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_tree_m_m<-predict(caret_tree_fit_m_m,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_tree_m_m<-mean((Train_M_Dataset $ACCIDENTES_GRAVES-y_tr_pred_tree_m_m)^2) # calcula el mse de entrenamiento
RMSE_tr_tree_m_m = sqrt(mse_tr_tree_m_m)
mse_tr_tree_m_m
## [1] 15023.26
RMSE_tr_tree_m_m
## [1] 122.5694

Calculo MSE y RMSE para los datos de validación

y_test_pred_tree_m_m<-predict(caret_tree_fit_m_m,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_tree_m_m<-mean((Test_M_Dataset$ACCIDENTES_GRAVES-y_test_pred_tree_m_m)^2) # calcula el mse de entrenamiento
RMSE_test_tree_m_m = sqrt(mse_test_tree_m_m)
mse_test_tree_m_m
## [1] 42876.64
RMSE_test_tree_m_m
## [1] 207.0667

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_tree_m_m,
            name='Modelo tree',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_tree_m_m,
            name='Modelo tree',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Accidentes leves

trcntrl = trainControl(method="cv", number=10)
caret_tree_fit_m_sd = caret::train(ACCIDENTES_LEVES∼Ano_Base+MES+Feriados,data=Train_M_Dataset ,
                              method = "rpart", trControl = trcntrl,
                      parms = list(split = "gini"),
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
caret_tree_fit_m_sd
## CART 
## 
## 48 samples
##  3 predictor
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 43, 42, 42, 43, 44, 43, ... 
## Resampling results across tuning parameters:
## 
##   cp           RMSE      Rsquared   MAE     
##   0.000000000  115.5862  0.3980590  93.95687
##   0.007035088  114.5438  0.3320399  93.25158
##   0.014070176  112.8181  0.3448152  91.51805
##   0.021105264  112.8181  0.3448152  91.51805
##   0.028140352  112.8181  0.3448152  91.51805
##   0.035175440  112.8181  0.3448152  91.51805
##   0.042210528  112.8181  0.3448152  91.51805
##   0.049245616  119.9709  0.2316120  98.13218
##   0.056280704  121.8201  0.2334852  99.05406
##   0.063315792  121.8201  0.2334852  99.05406
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.04221053.

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_tree_m_sd<-predict(caret_tree_fit_m_sd,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_tree_m_sd<-mean((Train_M_Dataset $ACCIDENTES_LEVES-y_tr_pred_tree_m_sd)^2) # calcula el mse de entrenamiento
RMSE_tr_tree_m_sd = sqrt(mse_tr_tree_m_sd)
mse_tr_tree_m_sd
## [1] 12301.03
RMSE_tr_tree_m_sd
## [1] 110.91

Calculo MSE y RMSE para los datos de validación

  y_test_pred_tree_m_sd<-predict(caret_tree_fit_m_sd,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_tree_m_sd<-mean((Test_M_Dataset$ACCIDENTES_LEVES-y_test_pred_tree_m_sd)^2) # calcula el mse de entrenamiento
RMSE_test_tree_m_sd = sqrt(mse_test_tree_m_sd)
mse_test_tree_m_sd
## [1] 14733.52
RMSE_test_tree_m_sd
## [1] 121.3817

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_tree_m_sd,
            name='Modelo tree',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes leves"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_tree_m_sd,
            name='Modelo tree',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes leves"),
         legend = list(x = 1, y = 0.4))

Resumen Modelos Árboles de regresión para los diferentes tipos de accidente

Tipo_de_accidentes= c("Total Accidentes","Accidentes Graves","Accidentes Leves")
RMSE_Train_tree = round(c(RMSE_tr_tree_m,RMSE_tr_tree_m_m,RMSE_tr_tree_m_sd), 3)
RMSE_Test_tree = round(c(RMSE_test_tree_m,RMSE_test_tree_m_m,RMSE_test_tree_m_sd),3)

Tabla_tree = data.frame (cbind(Tipo_de_accidentes,RMSE_Train_tree,RMSE_Test_tree))
Tabla_tree
##   Tipo_de_accidentes RMSE_Train_tree RMSE_Test_tree
## 1   Total Accidentes         204.392         247.03
## 2  Accidentes Graves         122.569        207.067
## 3   Accidentes Leves          110.91        121.382

5. BOSQUE ALEATORIO

Total Accidentes

trcntrl = trainControl(method="cv", number=10)
caret_rf_fit_s = caret::train(TOTAL_ACCIDENTES∼Ano_Base+MES+Feriados, data=Train_M_Dataset ,
                      method = "rf", trControl = trcntrl,
                      prox=TRUE,allowParallel=TRUE)
summary(caret_rf_fit_s)
##                 Length Class      Mode     
## call               6   -none-     call     
## type               1   -none-     character
## predicted         48   -none-     numeric  
## mse              500   -none-     numeric  
## rsq              500   -none-     numeric  
## oob.times         48   -none-     numeric  
## importance        13   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity       2304   -none-     numeric  
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            11   -none-     list     
## coefs              0   -none-     NULL     
## y                 48   -none-     numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            13   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          1   -none-     logical  
## param              2   -none-     list
caret_rf_fit_s
## Random Forest 
## 
## 48 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 42, 45, 44, 43, 43, 43, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    173.4228  0.5547127  141.9057
##    7    161.5589  0.5967454  142.7779
##   13    161.0391  0.5736748  141.5929
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 13.
plot(caret_rf_fit_s)

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_rf_s<-predict(caret_rf_fit_s,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_rf_s<-mean((Train_M_Dataset $TOTAL_ACCIDENTES-y_tr_pred_rf_s)^2) # calcula el mse de entrenamiento
RMSE_tr_rf_s = sqrt(mse_tr_rf_s)
mse_tr_rf_s
## [1] 8425.437
RMSE_tr_rf_s
## [1] 91.79018

Calculo MSE y RMSE para los datos de validación

y_test_pred_rf_s<-predict(caret_rf_fit_s,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_rf_s<-mean((Test_M_Dataset$TOTAL_ACCIDENTES-y_test_pred_rf_s)^2) # calcula el mse de entrenamiento
RMSE_test_rf_s = sqrt(mse_test_rf_s)
mse_test_rf_s
## [1] 31896.72
RMSE_test_rf_s
## [1] 178.5965

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_rf_s,
            name='Modelo rf',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~TOTAL_ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y=  ~y_test_pred_rf_s,
            name='Modelo rf',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.4))

Accidentes Graves

trcntrl = trainControl(method="cv", number=10)
caret_rf_fit_s_m = caret::train(ACCIDENTES_GRAVES∼Ano_Base+MES+Feriados, data=Train_M_Dataset ,
                      method = "rf", trControl = trcntrl,
                      prox=TRUE,allowParallel=TRUE)
summary(caret_rf_fit_s_m)
##                 Length Class      Mode     
## call               6   -none-     call     
## type               1   -none-     character
## predicted         48   -none-     numeric  
## mse              500   -none-     numeric  
## rsq              500   -none-     numeric  
## oob.times         48   -none-     numeric  
## importance        13   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity       2304   -none-     numeric  
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            11   -none-     list     
## coefs              0   -none-     NULL     
## y                 48   -none-     numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            13   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          1   -none-     logical  
## param              2   -none-     list
caret_rf_fit_s_m
## Random Forest 
## 
## 48 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 43, 44, 43, 43, 44, 42, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    97.85408  0.6040367  83.56640
##    7    84.13450  0.6419625  72.23023
##   13    84.80614  0.6285747  71.72686
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 7.
plot(caret_rf_fit_s_m)

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_rf_s_m<-predict(caret_rf_fit_s_m,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_rf_s_m<-mean((Train_M_Dataset $ACCIDENTES_GRAVES-y_tr_pred_rf_s_m)^2) # calcula el mse de entrenamiento
RMSE_tr_rf_s_m = sqrt(mse_tr_rf_s_m)
mse_tr_rf_s_m
## [1] 4075.912
RMSE_tr_rf_s_m
## [1] 63.84287

Calculo MSE y RMSE para los datos de validación

y_test_pred_rf_s_m<-predict(caret_rf_fit_s_m,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_rf_s_m<-mean((Test_M_Dataset$ACCIDENTES_GRAVES-y_test_pred_rf_s_m)^2) # calcula el mse de entrenamiento
RMSE_test_rf_s_m = sqrt(mse_test_rf_s_m)
mse_test_rf_s_m
## [1] 41209.68
RMSE_test_rf_s_m
## [1] 203.0017

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_rf_s_m,
            name='Modelo rf',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes graves"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_GRAVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_rf_s_m,
            name='Modelo rf',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes graves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes graves"),
         legend = list(x = 1, y = 0.4))

Accidentes Leves

trcntrl = trainControl(method="cv", number=10)
caret_rf_fit_s_sd = caret::train(ACCIDENTES_LEVES∼Ano_Base+MES+Feriados, data=Train_M_Dataset ,
                      method = "rf", trControl = trcntrl,
                      prox=TRUE,allowParallel=TRUE)
summary(caret_rf_fit_s_sd)
##                 Length Class      Mode     
## call               6   -none-     call     
## type               1   -none-     character
## predicted         48   -none-     numeric  
## mse              500   -none-     numeric  
## rsq              500   -none-     numeric  
## oob.times         48   -none-     numeric  
## importance        13   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity       2304   -none-     numeric  
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            11   -none-     list     
## coefs              0   -none-     NULL     
## y                 48   -none-     numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            13   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          1   -none-     logical  
## param              2   -none-     list
caret_rf_fit_s_sd
## Random Forest 
## 
## 48 samples
##  3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 43, 43, 43, 45, 42, 43, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared  MAE     
##    2    100.95042  0.426075  83.97683
##    7     95.99575  0.468268  82.06022
##   13     95.88647  0.472629  81.18917
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 13.
plot(caret_rf_fit_s_sd)

Calculo MSE y RMSE para los datos de entrenamiento

y_tr_pred_rf_s_sd<-predict(caret_rf_fit_s_sd,Train_M_Dataset [,c("Ano_Base","MES","Feriados")])
mse_tr_rf_s_sd<-mean((Train_M_Dataset $ACCIDENTES_LEVES-y_tr_pred_rf_s_sd)^2) # calcula el mse de entrenamiento
RMSE_tr_rf_s_sd = sqrt(mse_tr_rf_s_sd)
mse_tr_rf_s_sd
## [1] 4087.071
RMSE_tr_rf_s_sd
## [1] 63.93021

Calculo MSE y RMSE para los datos de validación

y_test_pred_rf_s_sd<-predict(caret_rf_fit_s_sd,Test_M_Dataset[,c("Ano_Base","MES","Feriados")])
mse_test_rf_s_sd<-mean((Test_M_Dataset$ACCIDENTES_LEVES-y_test_pred_rf_s_sd)^2) # calcula el mse de entrenamiento
RMSE_test_rf_s_sd = sqrt(mse_test_rf_s_sd)
mse_test_rf_s_sd
## [1] 4385.53
RMSE_test_rf_s_sd
## [1] 66.22333

Predicción en la muestra

plot_ly (data=Train_M_Dataset ,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_tr_pred_rf_s_sd,
            name='Modelo rf',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes leves"),
         legend = list(x = 1, y = 0.4))

Gráfica serie 2018

plot_ly (data=Test_M_Dataset,
         x = ~Ano_mes,
         y = ~ACCIDENTES_LEVES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~y_test_pred_rf_s_sd,
            name='Modelo rf',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  layout(title='Total accidentes leves',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes leves"),
         legend = list(x = 1, y = 0.4))

Resumen Modelos Random Forest para los diferentes tipos de accidente

Tipo_de_accidentes= c("Total Accidentes","Total Graves","Total Leves")
RMSE_Train_rf = round(c(RMSE_tr_rf_s,RMSE_tr_rf_s_m,RMSE_tr_rf_s_sd), 3)
RMSE_Test_rf = round(c(RMSE_test_rf_s,RMSE_test_rf_s_m,RMSE_test_rf_s_sd),3)

Tabla_rf = data.frame (cbind(Tipo_de_accidentes,RMSE_Train_rf,RMSE_Test_rf))
Tabla_rf
##   Tipo_de_accidentes RMSE_Train_rf RMSE_Test_rf
## 1   Total Accidentes         91.79      178.597
## 2       Total Graves        63.843      203.002
## 3        Total Leves         63.93       66.223

ELECCION DEL MODELO

1. Elección del modelo para el total de accidentes

Comparación en el entrenamiento

comparacion_tr<-data.frame(Ano_mes=Total_Dataset_Freq_M$Ano_mes[Total_Dataset_Freq_M$Ano_mes<="2017_12"],
                          ACCIDENTES=Total_Dataset_Freq_M$TOTAL_ACCIDENTES[Total_Dataset_Freq_M$Ano_mes<="2017_12"],
                           lm= y_tr_pred_lm_m, 
                           knn= y_tr_pred_knn_m, 
                           glm=y_tr_pred_glm_m ,
                           arbol=y_tr_pred_tree_m,
                           rf=y_tr_pred_rf_s)
plot_ly (data=comparacion_tr,
         x = ~Ano_mes,
         y = ~ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~lm,
            name='lm',
            line=list(width=1,color= "blue"))%>%
  add_trace(y= ~knn,
            name='knn',
            line=list(width=1,color="red"))%>%
  add_trace(y= ~glm,
            name='Modelo Poisson',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  add_trace(y= ~arbol,
            name='Árbol',
            line=list(width=1,color="green"))%>%
    add_trace(y= ~rf,
            name='Bosque',
            line=list(width=1,color='rgb(255, 51, 153)'))%>%
  layout(title='Total Accidentes (Entrenamiento)',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Unidades"),
         legend = list(x = 1, y = 0.9))

Comparación en la validación

comparacion_vl<-data.frame(Ano_mes=Test_M_Dataset$Ano_mes,
                          ACCIDENTES=Test_M_Dataset$TOTAL_ACCIDENTES,
                           lm= y_test_pred_lm_m, 
                           knn= y_test_pred_knn_m, 
                           glm=y_test_pred_glm_m ,
                           arbol=y_test_pred_tree_m,
                           rf=y_test_pred_rf_s)
plot_ly (data=comparacion_vl,
         x = ~Ano_mes,
         y = ~ACCIDENTES,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~lm,
            name='lm',
            line=list(width=1,color= "blue"))%>%
  add_trace(y= ~knn,
            name='knn',
            line=list(width=1,color="red"))%>%
  add_trace(y= ~glm,
            name='Modelo Poisson',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  add_trace(y= ~arbol,
            name='Árbol',
            line=list(width=1,color="green"))%>%
  add_trace(y= ~rf,
            name='Bosque',
            line=list(width=1,color='rgb(255, 51, 153)'))%>%
  layout(title='Total Accidentes (Validación)',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.9))

Comparación con los RMSE:

Entrenamiento<-round(c(RMSE_tr_lm_m,RMSE_tr_knn_m,RMSE_tr_glm_m,RMSE_tr_tree_m,RMSE_tr_rf_s),3) 
Validacion<-round(c(RMSE_test_lm_m,RMSE_test_knn_m,RMSE_test_glm_m,RMSE_test_tree_m,RMSE_test_rf_s),3) 
nombres<-c("lm","knn","glm","árbol","bosque")
ResultadosRMSE<-data.frame(Entrenamiento=Entrenamiento,Validacion=Validacion)
rownames(ResultadosRMSE)<-nombres

Cálculo de la variación

ResultadosRMSE$Por_variacion<-((ResultadosRMSE$Validacion-ResultadosRMSE$Entrenamiento)/ResultadosRMSE$Entrenamiento)*100
ResultadosRMSE
##        Entrenamiento Validacion Por_variacion
## lm            83.919    205.348     144.69786
## knn          216.306    177.615     -17.88716
## glm           83.803    110.882      32.31269
## árbol        204.392    247.030      20.86089
## bosque        91.790    178.597      94.57130

2. Elección del modelo para Accidentes graves

Comparación en el entrenamiento

comparacion_tr<-data.frame(Ano_mes=Total_Dataset_Freq_M$Ano_mes[Total_Dataset_Freq_M$Ano_mes<="2017_12"],
                          ACCIDENTESG=Total_Dataset_Freq_M$ACCIDENTES_GRAVES[Total_Dataset_Freq_M$Ano_mes<="2017_12"],
                           lm= y_tr_pred_lm_m_m, 
                           knn= y_tr_pred_knn_m_m, 
                           glm=y_tr_pred_glm_m_m,
                           arbol=y_tr_pred_tree_m_m,
                           rf=y_tr_pred_rf_s_m)
plot_ly (data=comparacion_tr,
         x = ~Ano_mes,
         y = ~ACCIDENTESG,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~lm,
            name='lm',
            line=list(width=1,color= "blue"))%>%
  add_trace(y= ~knn,
            name='knn',
            line=list(width=1,color="red"))%>%
  add_trace(y= ~glm,
            name='Modelo Poisson',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  add_trace(y= ~arbol,
            name='Árbo',
            line=list(width=1,color="green"))%>%
    add_trace(y= ~rf,
            name='Bosque',
            line=list(width=1,color='rgb(255, 51, 153)'))%>%
  layout(title='Accidentes graves (Entrenamiento)',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.9))

Comparación en la validación

comparacion_vl<-data.frame(Ano_mes=Test_M_Dataset$Ano_mes,
                          ACCIDENTESG=Test_M_Dataset$ACCIDENTES_GRAVES,
                           lm= y_test_pred_lm_m_m, 
                           knn= y_test_pred_knn_m_m, 
                           glm=y_test_pred_glm_m_m,
                           arbol=y_test_pred_tree_m_m,
                           rf=y_test_pred_rf_s_m)
plot_ly (data=comparacion_vl,
         x = ~Ano_mes,
         y = ~ACCIDENTESG,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~lm,
            name='lm',
            line=list(width=1,color= "blue"))%>%
  add_trace(y= ~knn,
            name='knn',
            line=list(width=1,color="red"))%>%
  add_trace(y= ~glm,
            name='Modelo Poisson',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  add_trace(y= ~arbol,
            name='Árbol',
            line=list(width=1,color="green"))%>%
  add_trace(y= ~rf,
            name='Bosque',
            line=list(width=1,color='rgb(255, 51, 153)'))%>%
  layout(title='Accidentes graves (Validación)',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.9))

Comparación con los RMSE:

Entrenamiento<-round(c(RMSE_tr_lm_m_m,RMSE_tr_knn_m_m,RMSE_tr_glm_m_m,RMSE_tr_tree_m_m,RMSE_tr_rf_s_m),3) 
Validacion<-round(c(RMSE_test_lm_m_m,RMSE_test_knn_m_m,RMSE_test_glm_m_m,RMSE_test_tree_m_m,RMSE_test_rf_s_m),3) 
nombres<-c("lm","knn","glm","árbol","bosque")
ResultadosRMSE<-data.frame(Entrenamiento=Entrenamiento,Validacion=Validacion)
rownames(ResultadosRMSE)<-nombres

Cálculo de la variación

ResultadosRMSE$Variacion<-((ResultadosRMSE$Validacion-ResultadosRMSE$Entrenamiento)/ResultadosRMSE$Entrenamiento)*100
ResultadosRMSE
##        Entrenamiento Validacion Variacion
## lm            65.964    201.341 205.22861
## knn          128.533    164.804  28.21921
## glm           65.243     72.043  10.42257
## árbol        122.569    207.067  68.93913
## bosque        63.843    203.002 217.97065

3. Elección del modelo para Accidentes leves

Comparación en el entrenamiento

comparacion_tr<-data.frame(Ano_mes=Total_Dataset_Freq_M$Ano_mes[Total_Dataset_Freq_M$Ano_mes<="2017_12"],
                          ACCIDENTESL=Total_Dataset_Freq_M$ACCIDENTES_LEVES[Total_Dataset_Freq_M$Ano_mes<="2017_12"],
                           lm= y_tr_pred_lm_m_sd, 
                           knn= y_tr_pred_knn_m_sd, 
                           glm=y_tr_pred_glm_m_sd,
                           arbol=y_tr_pred_tree_m_sd,
                           rf=y_tr_pred_rf_s_sd)
plot_ly (data=comparacion_tr,
         x = ~Ano_mes,
         y = ~ACCIDENTESL,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~lm,
            name='lm',
            line=list(width=1,color= "blue"))%>%
  add_trace(y= ~knn,
            name='knn',
            line=list(width=1,color="red"))%>%
  add_trace(y= ~glm,
            name='Modelo Poisson',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  add_trace(y= ~arbol,
            name='Árbo',
            line=list(width=1,color="green"))%>%
    add_trace(y= ~rf,
            name='Bosque',
            line=list(width=1,color='rgb(255, 51, 153)'))%>%
  layout(title='Accidentes leves (Entrenamiento)',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.9))

Comparación en la validación

comparacion_vl<-data.frame(Ano_mes=Test_M_Dataset$Ano_mes,
                          ACCIDENTESL=Test_M_Dataset$ACCIDENTES_GRAVES,
                           lm= y_test_pred_lm_m_sd, 
                           knn= y_test_pred_knn_m_sd, 
                           glm=y_test_pred_glm_m_sd,
                           arbol=y_test_pred_tree_m_sd,
                           rf=y_test_pred_rf_s_sd)
plot_ly (data=comparacion_vl,
         x = ~Ano_mes,
         y = ~ACCIDENTESL,
         type = "scatter" ,mode = "lines",
         name='Real',
         line=list(width=1,color='rgb(205, 12, 24)'))%>%
  add_trace(y= ~lm,
            name='lm',
            line=list(width=1,color= "blue"))%>%
  add_trace(y= ~knn,
            name='knn',
            line=list(width=1,color="red"))%>%
  add_trace(y= ~glm,
            name='Modelo Poisson',
            line=list(width=1,color='rgb(22, 96, 167)'))%>%
  add_trace(y= ~arbol,
            name='Árbol',
            line=list(width=1,color="green"))%>%
  add_trace(y= ~rf,
            name='Bosque',
            line=list(width=1,color='rgb(255, 51, 153)'))%>%
  layout(title='Accidentes lesves (Validación)',
         xaxis=list(title="Fecha"),
         yaxis=list(title="Accidentes"),
         legend = list(x = 1, y = 0.9))

Comparación con los RMSE:

Entrenamiento<-round(c(RMSE_tr_lm_m_sd,RMSE_tr_knn_m_sd,RMSE_tr_glm_m_sd,RMSE_tr_tree_m_sd,RMSE_tr_rf_s_sd),3) 
Validacion<-round(c(RMSE_test_lm_m_sd,RMSE_test_knn_m_sd,RMSE_test_glm_m_sd,RMSE_test_tree_m_sd,RMSE_test_rf_s_sd),3)
nombres<-c("lm","knn","glm","árbol","bosque")
ResultadosRMSE<-data.frame(Entrenamiento=Entrenamiento,Validacion=Validacion)
rownames(ResultadosRMSE)<-nombres

Cálculo de la variación

ResultadosRMSE$Variacion<-((ResultadosRMSE$Validacion-ResultadosRMSE$Entrenamiento)/ResultadosRMSE$Entrenamiento)*100
ResultadosRMSE
##        Entrenamiento Validacion Variacion
## lm            59.583     66.603 11.781884
## knn          114.519    125.714  9.775670
## glm           59.309     74.353 25.365459
## árbol        110.910    121.382  9.441890
## bosque        63.930     66.223  3.586735

Modelos elegidos

Teniendo como criterio el mínimo RMSE en la muestra de validación se eligen los siguientes modelos:

Modelo de regresión lineal generalizado para predición de Total Accidentes

Se ajusta el modelo con todos los datos desde el 01-01-2014 al 31-12-2018

glm_fit_m_final<-glm(TOTAL_ACCIDENTES∼Ano_Base+MES+Feriados, data=Total_Dataset_Freq_M , family = "poisson")
summary(glm_fit_m_final)
## 
## Call:
## glm(formula = TOTAL_ACCIDENTES ~ Ano_Base + MES + Feriados, family = "poisson", 
##     data = Total_Dataset_Freq_M)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6894  -1.3880   0.0389   1.4887   3.7186  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  8.050094   0.012016 669.974  < 2e-16 ***
## Ano_Base     0.003067   0.004468   0.686    0.492    
## MES02        0.068058   0.014110   4.823 1.41e-06 ***
## MES03        0.172526   0.011085  15.564  < 2e-16 ***
## MES04        0.115619   0.011700   9.882  < 2e-16 ***
## MES05        0.189865   0.011013  17.240  < 2e-16 ***
## MES06        0.111156   0.011254   9.877  < 2e-16 ***
## MES07        0.163301   0.011193  14.589  < 2e-16 ***
## MES08        0.215948   0.010949  19.724  < 2e-16 ***
## MES09        0.152463   0.013939  10.938  < 2e-16 ***
## MES10        0.161179   0.011834  13.621  < 2e-16 ***
## MES11        0.118019   0.011197  10.540  < 2e-16 ***
## MES12        0.150864   0.011112  13.577  < 2e-16 ***
## Feriados    -0.020655   0.004278  -4.828 1.38e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 834.17  on 59  degrees of freedom
## Residual deviance: 191.60  on 46  degrees of freedom
## AIC: 819.23
## 
## Number of Fisher Scoring iterations: 3

Se guardan el modelo en un objeto de r

saveRDS(glm_fit_m_final,"../Modelos/Prediccion_Total_Mensual.rds")
Modelo_Total_mensual<-readRDS(file="../Modelos/Prediccion_Total_Mensual.rds")
Modelo de regresión lineal generalizado para predición de Accidentes Graves

Se ajusta el modelo con todos los datos desde el 01-01-2014 al 31-12-2018

glm_fit_m_m_final<-glm(ACCIDENTES_GRAVES∼Ano_Base+MES+Feriados, data=Total_Dataset_Freq_M, family = "poisson")
summary(glm_fit_m_m_final)
## 
## Call:
## glm(formula = ACCIDENTES_GRAVES ~ Ano_Base + MES + Feriados, 
##     family = "poisson", data = Total_Dataset_Freq_M)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.4227  -1.3795   0.0799   1.2927   4.6366  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.491875   0.016027 467.444  < 2e-16 ***
## Ano_Base    -0.023352   0.006002  -3.890  0.00010 ***
## MES02        0.050832   0.018871   2.694  0.00707 ** 
## MES03        0.165011   0.014738  11.196  < 2e-16 ***
## MES04        0.087419   0.015622   5.596 2.20e-08 ***
## MES05        0.161570   0.014706  10.987  < 2e-16 ***
## MES06        0.093435   0.014997   6.230 4.66e-10 ***
## MES07        0.141215   0.014938   9.453  < 2e-16 ***
## MES08        0.195420   0.014593  13.391  < 2e-16 ***
## MES09        0.133319   0.018644   7.151 8.63e-13 ***
## MES10        0.115258   0.015882   7.257 3.95e-13 ***
## MES11        0.063384   0.015051   4.211 2.54e-05 ***
## MES12        0.079161   0.014994   5.280 1.29e-07 ***
## Feriados    -0.016373   0.005740  -2.853  0.00434 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 648.15  on 59  degrees of freedom
## Residual deviance: 297.73  on 46  degrees of freedom
## AIC: 889.58
## 
## Number of Fisher Scoring iterations: 3

Se guardan el modelo en un objeto de r

saveRDS(glm_fit_m_m_final,"../Modelos/Prediccion_Grave_Mensual.rds")
Modelo_Grave_mensual<-readRDS(file="../Modelos/Prediccion_Grave_Mensual.rds")
Modelo de regresión lineal para predición de Accidentes Leves

Se ajusta el modelo con todos los datos desde el 01-01-2014 al 31-12-2018

trcntrl = trainControl(method="cv", number=10)
caret_lm_fit_m_sd_final = caret::train(ACCIDENTES_LEVES∼Ano_Base+MES+Feriados, data=Total_Dataset_Freq_M ,
                      method = "lm", trControl = trcntrl,
                      preProcess=c("center", "scale"),
                      tuneLength = 10)
summary(caret_lm_fit_m_sd_final)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -130.356  -34.185    0.716   41.016  134.546 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1566.217      8.752 178.952  < 2e-16 ***
## Ano_Base      27.276      8.833   3.088  0.00341 ** 
## MES02         33.320     15.314   2.176  0.03474 *  
## MES03         72.649     11.989   6.060 2.36e-07 ***
## MES04         59.539     12.549   4.744 2.06e-05 ***
## MES05         91.921     11.950   7.692 8.45e-10 ***
## MES06         52.216     11.989   4.355 7.36e-05 ***
## MES07         76.734     12.103   6.340 8.94e-08 ***
## MES08         99.447     11.950   8.322 9.97e-11 ***
## MES09         71.226     15.314   4.651 2.81e-05 ***
## MES10         89.266     12.874   6.934 1.15e-08 ***
## MES11         73.972     11.950   6.190 1.50e-07 ***
## MES12         97.440     11.950   8.154 1.76e-10 ***
## Feriados     -36.197     15.652  -2.313  0.02527 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 67.79 on 46 degrees of freedom
## Multiple R-squared:  0.7544, Adjusted R-squared:  0.685 
## F-statistic: 10.87 on 13 and 46 DF,  p-value: 4.754e-10

Se guardan el modelo en un objeto de r

saveRDS(caret_lm_fit_m_sd_final,"../Modelos/Prediccion_leves_Mensual.rds")
Modelo_leves_mensual<-readRDS(file="../Modelos/Prediccion_leves_Mensual.rds")
Datos para pronóstico

Se oganizan los datos necesarios para el pronóstico de los accidentes en los años 2019, 2020 y 2021

Importación de los datos

load("../data/Dias_Especiales_Mensuales.Rda")
datos_pronostico_mensual<-Dias_Especiales_Mensuales[,c("ANO","Ano_Base","MES","Feriados")]
datos_pronostico_mensual$ANO1 <- datos_pronostico_mensual$ANO
datos_pronostico_mensual$MES1 <- datos_pronostico_mensual$MES
library(dplyr)
datos_pronostico_mensual<-unite_(datos_pronostico_mensual, "Ano_Mes", c("ANO1","MES1"))

Predicción del Total de accidentes con el modelo de regresión lineal generalizado

datos_pronostico_mensual$prediccion_Total_m<-predict(Modelo_Total_mensual,datos_pronostico_mensual[,c("ANO","Ano_Base","MES","Feriados")],type="response")

Predicción de accidentes graves con el modelo de regresión lineal generalizado

datos_pronostico_mensual$prediccion_Graves_m<-predict(Modelo_Grave_mensual,datos_pronostico_mensual[,c("ANO","Ano_Base","MES","Feriados")],type="response")

Predicción de accidentes leves con el modelo de regresión lineal

datos_pronostico_mensual$prediccion_Leves_m<-predict(Modelo_leves_mensual,datos_pronostico_mensual[,c("ANO","Ano_Base","MES","Feriados")])

Se guardan los datos de pronóstico en un objeto de r

save(datos_pronostico_mensual,file="../Modelos/datos_pronostico_mensual.Rda")